This scripts plots the coverage of 98k random positions, based on mpileup with indels removed. # Autosomal coverage
# cov is a sync file after removind indels
cov <- read.table("~/sex_ratio_EE/results/5.create_sync_file/autosomes/EE_SR_IndelsRm_RepeatsRm_autosomes_subsampled.sync")
# remove N and del (last 2 digits from each cell)
remove_last_digits <- function(x) {
gsub(".{0,4}$", "", x)
}
cov[4:ncol(cov)] <- lapply(cov[4:ncol(cov)], remove_last_digits)
# add col names
sample_names <- read.table("~/sex_ratio_EE/data/sample_info.txt", sep = "\t", h = T)[,5]
colnames(cov) <- c("LG", "pos", "ref_allele", sample_names)
# sum values within each cell (sum the coverage of all nucleotides)
sum_split_values <- function(x) {
sapply(strsplit(x, ":"), function(y) sum(as.numeric(y)))
}
# apply the function
cov <- cov %>%
mutate(across(contains("males"), sum_split_values))
# change the format to longer
cov_longer_autosomes <- cov %>%
arrange(LG, pos) %>% # sort the mpileup according to position
pivot_longer(cols = tidyselect::contains("male"), # change format to longer
names_to = c("line", "sex", "generation"),
names_sep = "_",
values_to = "coverage")%>%
mutate(line_generation = paste0(line, "_", generation, "_", sex))
Calculate mean coverage per line
# calculate mean coverage per line:
cov_longer_autosomes %>%
group_by(line, generation, sex) %>%
summarise(mean_coverage = mean(coverage))
## `summarise()` has grouped output by 'line', 'generation'. You can override
## using the `.groups` argument.
## # A tibble: 32 × 4
## # Groups: line, generation [16]
## line generation sex mean_coverage
## <chr> <chr> <chr> <dbl>
## 1 FB-A F1 females 47.9
## 2 FB-A F1 males 29.5
## 3 FB-A F28 females 53.5
## 4 FB-A F28 males 48.4
## 5 FB-B F1 females 40.9
## 6 FB-B F1 males 32.5
## 7 FB-B F28 females 59.4
## 8 FB-B F28 males 50.4
## 9 FB-C F1 females 47.5
## 10 FB-C F1 males 30.2
## # ℹ 22 more rows
Plot raw coverage of each line
# plot coverage
cov_plot <- ggplot(cov_longer_autosomes, aes(coverage)) +
geom_density(aes(color = line_generation)) +
geom_density(linewidth = 1.5) +
xlim(0,200) +
#ylim(0,0.055) +
scale_color_viridis(discrete = T) +
theme(legend.position = "none")+
#facet_wrap(~generation) +
ggtitle("Before summing coverage across sexes; autosomes only")
plot(cov_plot)
## Warning: Removed 1881 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 1881 rows containing non-finite outside the scale range
## (`stat_density()`).
# convert to a plotly plot
interactive_cov_plot <- ggplotly(cov_plot)
## Warning: Removed 1881 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 1881 rows containing non-finite outside the scale range
## (`stat_density()`).
interactive_cov_plot
# plot coverage (only female samples)
cov_longer_autosomes %>%
filter(sex == "females") %>%
ggplot(aes(coverage)) +
geom_density(aes(color = line_generation)) +
geom_density(linewidth = 1.5) +
xlim(0,200) +
#ylim(0,0.055) +
scale_color_viridis(discrete = T) +
theme(legend.position = "none")+
#facet_wrap(~generation) +
ggtitle("Before summing coverage across sexes; female autosomes only")
## Warning: Removed 1034 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 1034 rows containing non-finite outside the scale range
## (`stat_density()`).
The target coverage for autosomes is 100.98 (vertical line) Therefore, the mean coverage across all lines has to be between 50X and 201X.
# cov_check is a sync file after merging male and female coverage
cov_merged <- read.table("~/sex_ratio_EE/results/5.create_sync_file/autosomes/EE_SR_IndelsRm_RepeatsRm_sexMergedCov_autosomes_subsampled.sync")
# remove N and del (last 2 digits from each cell)
cov_merged[4:ncol(cov_merged)] <- lapply(cov_merged[4:ncol(cov_merged)], remove_last_digits)
# add col names
sample_names_check <- unique(c("FB-A_F1", "FB-A_F1", "FB-B_F1", "FB-B_F1", "FB-C_F1", "FB-C_F1" , "FB-D_F1", "FB-D_F1", "MB-A_F1", "MB-A_F1", "MB-B_F1", "MB-B_F1", "MB-C_F1", "MB-C_F1", "MB-D_F1", "MB-D_F1", "FB-A_F28", "FB-A_F28", "FB-B_F28", "FB-B_F28", "FB-C_F28", "FB-C_F28" , "FB-D_F28", "FB-D_F28", "MB-A_F28", "MB-A_F28", "MB-B_F28", "MB-B_F28", "MB-C_F28", "MB-C_F28", "MB-D_F28", "MB-D_F28"))
colnames(cov_merged) <- c("LG", "pos", "ref_allele", sample_names_check)
# sum values within each cell (sum the coverage of all nucleotides)
cov_merged <- cov_merged %>%
mutate(across(contains("_F"), sum_split_values))
# change the format to longer
cov_merged_longer_autosomes <- cov_merged %>%
arrange(LG, pos) %>% # sort the mpileup according to position
pivot_longer(cols = tidyselect::contains("_F"), # change format to longer
names_to = c("line", "generation"),
names_sep = "_",
values_to = "coverage")%>%
mutate(line_generation = paste0(line, "_", generation))
cov_merged_longer_autosomes %>%
group_by(line, generation) %>%
summarise(mean_cov = mean(coverage))
## `summarise()` has grouped output by 'line'. You can override using the
## `.groups` argument.
## # A tibble: 16 × 3
## # Groups: line [8]
## line generation mean_cov
## <chr> <chr> <dbl>
## 1 FB-A F1 77.4
## 2 FB-A F28 102.
## 3 FB-B F1 73.1
## 4 FB-B F28 109.
## 5 FB-C F1 77.5
## 6 FB-C F28 89.1
## 7 FB-D F1 59.1
## 8 FB-D F28 92.0
## 9 MB-A F1 63.5
## 10 MB-A F28 94.7
## 11 MB-B F1 59.5
## 12 MB-B F28 107.
## 13 MB-C F1 75.0
## 14 MB-C F28 106.
## 15 MB-D F1 87.4
## 16 MB-D F28 102.
# plot coverage
plot_cov_merged_longer_autosomes <- ggplot(cov_merged_longer_autosomes, aes(coverage)) +
geom_density(aes(color = line_generation)) +
geom_density(linewidth = 1.5) +
xlim(0,400) +
#ylim(0,0.055) +
scale_color_viridis(discrete = T) +
theme(legend.position = "none")+
#facet_wrap(~generation) +
ggtitle("After summing coverage across sexes; autosomes only")+
geom_vline(xintercept = 100.98)
plot(plot_cov_merged_longer_autosomes)
## Warning: Removed 676 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 676 rows containing non-finite outside the scale range
## (`stat_density()`).
# convert to a plotly plot
interactive_cov_plot_merged_autosomes <- ggplotly(plot_cov_merged_longer_autosomes)
## Warning: Removed 676 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 676 rows containing non-finite outside the scale range
## (`stat_density()`).
interactive_cov_plot_merged_autosomes
Read the mpileup file with sex chromosome
# load coverage from the sex chromosome. Note that hash is not used as a comment character!
cov_sex <- read.table("~/sex_ratio_EE/results/5.create_sync_file/sexChr/EE_SR_IndelsRm_RepeatsRm_sexChromosome_subsampled.sync")
# remove N and del (last 2 digits from each cell)
remove_last_digits <- function(x) {
gsub(".{0,4}$", "", x)
}
cov_sex[4:ncol(cov_sex)] <- lapply(cov_sex[4:ncol(cov_sex)], remove_last_digits)
# add col names
colnames(cov_sex) <- c("LG", "pos", "ref_allele", sample_names)
# sum values within each cell (sum the coverage of all nucleotides)
sum_split_values <- function(x) {
sapply(strsplit(x, ":"), function(y) sum(as.numeric(y)))
}
# apply the function
cov_sex <- cov_sex %>%
mutate(across(contains("males"), sum_split_values))
# subset data and change format to longer
cov_longer_sexChr <- cov_sex %>%
arrange(LG, pos) %>% # sort the mpileup according to position
pivot_longer(cols = tidyselect::contains("male"), # change format to longer
names_to = c("line", "sex", "generation"),
names_sep = "_",
values_to = "coverage")%>%
mutate(line_generation = paste0(line, "_", generation, "_", sex)) %>%
filter(LG == "LG4") # remove sex chromosome
# calculate mean coverage per line:
cov_longer_sexChr %>%
group_by(line, generation, sex) %>%
summarise(mean_coverage = mean(coverage))
## `summarise()` has grouped output by 'line', 'generation'. You can override
## using the `.groups` argument.
## # A tibble: 32 × 4
## # Groups: line, generation [16]
## line generation sex mean_coverage
## <chr> <chr> <chr> <dbl>
## 1 FB-A F1 females 44.7
## 2 FB-A F1 males 18.2
## 3 FB-A F28 females 50.1
## 4 FB-A F28 males 27.2
## 5 FB-B F1 females 37.3
## 6 FB-B F1 males 18.9
## 7 FB-B F28 females 51.5
## 8 FB-B F28 males 28.0
## 9 FB-C F1 females 43.1
## 10 FB-C F1 males 17.3
## # ℹ 22 more rows
Target coverage = 50.41X
# plot female coverage of sex chromosome
cov_plot_sexChr <- cov_longer_sexChr %>%
filter(sex == "females") %>%
ggplot(aes(coverage)) +
geom_density(aes(color = line_generation)) +
geom_density(linewidth = 1.5) +
xlim(0,200) +
ylim(0,0.055) +
scale_color_viridis(discrete = T) +
theme(legend.position = "none")+
ggtitle("Sex chromosome, females") +
geom_vline(xintercept = 52.4)
plot(cov_plot_sexChr)
## Warning: Removed 144 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 144 rows containing non-finite outside the scale range
## (`stat_density()`).
# convert to a plotly plot
interactive_cov_plot_sexChr <- ggplotly(cov_plot_sexChr)
## Warning: Removed 144 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 144 rows containing non-finite outside the scale range
## (`stat_density()`).
interactive_cov_plot_sexChr
Male coverage of the sex chromosome Tagret coverage = 16.04X
# plot male coverage of sex chromosome
cov_plot_sexChr_males <- cov_longer_sexChr %>%
filter(sex == "males") %>%
ggplot(aes(coverage)) +
geom_density(aes(color = line_generation)) +
geom_density(linewidth = 1.5) +
xlim(0,200) +
ylim(0,0.055) +
scale_color_viridis(discrete = T) +
theme(legend.position = "none")+
ggtitle("Sex chromosome, males") +
geom_vline(xintercept = 19.6)
plot(cov_plot_sexChr_males)
## Warning: Removed 84 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 84 rows containing non-finite outside the scale range
## (`stat_density()`).
# convert to a plotly plot
interactive_cov_plot_sexChr_males <- ggplotly(cov_plot_sexChr_males)
## Warning: Removed 84 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 84 rows containing non-finite outside the scale range
## (`stat_density()`).
interactive_cov_plot_sexChr_males
The target coverage for the sex chromosome is 77.5X (vertical line) Therefore, the mean coverage across all lines has to be between 38X and 155X.
# cov_check is a sync file after merging male and female coverage
cov_merged_sex <- read.table("~/sex_ratio_EE/results/5.create_sync_file/sexChr/EE_SR_IndelsRm_RepeatsRm_sexMergedCov_sexChromosome_subsampled.sync")
# remove N and del (last 2 digits from each cell)
cov_merged_sex[4:ncol(cov_merged_sex)] <- lapply(cov_merged_sex[4:ncol(cov_merged_sex)], remove_last_digits)
# add col names
sample_names_check <- unique(c("FB-A_F1", "FB-A_F1", "FB-B_F1", "FB-B_F1", "FB-C_F1", "FB-C_F1" , "FB-D_F1", "FB-D_F1", "MB-A_F1", "MB-A_F1", "MB-B_F1", "MB-B_F1", "MB-C_F1", "MB-C_F1", "MB-D_F1", "MB-D_F1", "FB-A_F28", "FB-A_F28", "FB-B_F28", "FB-B_F28", "FB-C_F28", "FB-C_F28" , "FB-D_F28", "FB-D_F28", "MB-A_F28", "MB-A_F28", "MB-B_F28", "MB-B_F28", "MB-C_F28", "MB-C_F28", "MB-D_F28", "MB-D_F28"))
colnames(cov_merged_sex) <- c("LG", "pos", "ref_allele", sample_names_check)
# sum values within each cell (sum the coverage of all nucleotides)
cov_merged_sex <- cov_merged_sex %>%
mutate(across(contains("_F"), sum_split_values))
# change the format to longer
cov_merged_longer_sex <- cov_merged_sex %>%
arrange(LG, pos) %>% # sort the mpileup according to position
pivot_longer(cols = tidyselect::contains("_F"), # change format to longer
names_to = c("line", "generation"),
names_sep = "_",
values_to = "coverage")%>%
mutate(line_generation = paste0(line, "_", generation))
# plot coverage
plot_cov_merged_longer_sex <- ggplot(cov_merged_longer_sex, aes(coverage)) +
geom_density(aes(color = line_generation)) +
geom_density(linewidth = 1.5) +
xlim(0,400) +
#ylim(0,0.055) +
scale_color_viridis(discrete = T) +
theme(legend.position = "none")+
#facet_wrap(~generation) +
ggtitle("After summing coverage across sexes; sex chromosome only")+
geom_vline(xintercept = 77.5)
plot(plot_cov_merged_longer_sex)
## Warning: Removed 33 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 33 rows containing non-finite outside the scale range
## (`stat_density()`).
# convert to a plotly plot
interactive_cov_plot_merged_sex <- ggplotly(plot_cov_merged_longer_sex)
## Warning: Removed 33 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 33 rows containing non-finite outside the scale range
## (`stat_density()`).
interactive_cov_plot_merged_sex
# cov_check is a sync file after merging male and female coverage
cov_merged_covFilt <- read.table("~/sex_ratio_EE/results/6.filter_by_coverage/withoutAdditionalFiltering/autosomes/all_lines//EE_SR_IndelsRm_RepeatsRm_sexMergedCov_autosomes_filteredCov_subsampled.sync")
# remove N and del (last 2 digits from each cell)
cov_merged_covFilt[4:ncol(cov_merged_covFilt)] <- lapply(cov_merged_covFilt[4:ncol(cov_merged_covFilt)], remove_last_digits)
# add col names
sample_names_check <- unique(c("FB-A_F1", "FB-A_F1", "FB-B_F1", "FB-B_F1", "FB-C_F1", "FB-C_F1" , "FB-D_F1", "FB-D_F1", "MB-A_F1", "MB-A_F1", "MB-B_F1", "MB-B_F1", "MB-C_F1", "MB-C_F1", "MB-D_F1", "MB-D_F1", "FB-A_F28", "FB-A_F28", "FB-B_F28", "FB-B_F28", "FB-C_F28", "FB-C_F28" , "FB-D_F28", "FB-D_F28", "MB-A_F28", "MB-A_F28", "MB-B_F28", "MB-B_F28", "MB-C_F28", "MB-C_F28", "MB-D_F28", "MB-D_F28"))
colnames(cov_merged_covFilt) <- c("LG", "pos", "ref_allele", sample_names_check)
# sum values within each cell (sum the coverage of all nucleotides)
cov_merged_covFilt <- cov_merged_covFilt %>%
mutate(across(contains("_F"), sum_split_values))
# change the format to longer
cov_merged_covFilt_longer <- cov_merged_covFilt %>%
arrange(LG, pos) %>% # sort the mpileup according to position
pivot_longer(cols = tidyselect::contains("_F"), # change format to longer
names_to = c("line", "generation"),
names_sep = "_",
values_to = "coverage")%>%
mutate(line_generation = paste0(line, "_", generation))
# plot coverage
plot_cov_merged_covFilt_longer <- ggplot(cov_merged_covFilt_longer, aes(coverage)) +
geom_density(aes(color = line_generation)) +
geom_density(linewidth = 1.5) +
xlim(0,400) +
#ylim(0,0.055) +
scale_color_viridis(discrete = T) +
theme(legend.position = "none")+
#facet_wrap(~generation) +
ggtitle("After summing coverage across sexes; sex chromosome only", subtitle = "After filtering on coverage")+
geom_vline(xintercept = 107.24) +
geom_vline(xintercept = 107.24/2, linetype = "dashed") +
geom_vline(xintercept = 107.24 * 2, linetype = "dashed")
plot(plot_cov_merged_covFilt_longer)
# convert to a plotly plot
interactive_plot_cov_merged_covFilt_longer <- ggplotly(plot_cov_merged_covFilt_longer)
interactive_plot_cov_merged_covFilt_longer
cov_merged_covFilt_longer %>%
group_by(generation, line) %>%
summarise(mean_coverage_autosomes = mean(coverage))
## `summarise()` has grouped output by 'generation'. You can override using the
## `.groups` argument.
## # A tibble: 16 × 3
## # Groups: generation [2]
## generation line mean_coverage_autosomes
## <chr> <chr> <dbl>
## 1 F1 FB-A 106.
## 2 F1 FB-B 100.
## 3 F1 FB-C 106.
## 4 F1 FB-D 81.1
## 5 F1 MB-A 87.0
## 6 F1 MB-B 81.3
## 7 F1 MB-C 102.
## 8 F1 MB-D 120.
## 9 F28 FB-A 140.
## 10 F28 FB-B 151.
## 11 F28 FB-C 123.
## 12 F28 FB-D 127.
## 13 F28 MB-A 131.
## 14 F28 MB-B 147.
## 15 F28 MB-C 146.
## 16 F28 MB-D 141.
pos <- position_jitterdodge(jitter.width = 0.2, dodge.width = 0.5, seed = 2)
plot_cov <- cov_merged_covFilt_longer %>%
group_by(generation, line) %>%
summarise(mean_coverage_autosomes = mean(coverage)) %>%
mutate(treatment = ifelse(str_detect(line, "^F"), "female-biased", "male_biased")) %>%
ggplot(aes(generation, mean_coverage_autosomes, color = treatment, label = line)) +
geom_point(position = pos) +
ggrepel::geom_label_repel(position = pos) +
theme_minimal() +
theme(legend.position = "none")
## `summarise()` has grouped output by 'generation'. You can override using the
## `.groups` argument.
ggsave("~/sex_ratio_EE/results/5.create_sync_file/mean_coverage_per_line.png", plot = plot_cov, width = 10, height = 7, dpi = 300)
# cov_check is a sync file after merging male and female coverage
cov_merged_covFilt_sex <- read.table("~/sex_ratio_EE/results/6.filter_by_coverage/withoutAdditionalFiltering/sexChr/all_lines/EE_SR_IndelsRm_RepeatsRm_sexMergedCov_sexChromosome_filteredCov_subsampled.sync")
# remove N and del (last 2 digits from each cell)
cov_merged_covFilt_sex[4:ncol(cov_merged_covFilt_sex)] <- lapply(cov_merged_covFilt_sex[4:ncol(cov_merged_covFilt_sex)], remove_last_digits)
# add col names
sample_names_check <- unique(c("FB-A_F1", "FB-A_F1", "FB-B_F1", "FB-B_F1", "FB-C_F1", "FB-C_F1" , "FB-D_F1", "FB-D_F1", "MB-A_F1", "MB-A_F1", "MB-B_F1", "MB-B_F1", "MB-C_F1", "MB-C_F1", "MB-D_F1", "MB-D_F1", "FB-A_F28", "FB-A_F28", "FB-B_F28", "FB-B_F28", "FB-C_F28", "FB-C_F28" , "FB-D_F28", "FB-D_F28", "MB-A_F28", "MB-A_F28", "MB-B_F28", "MB-B_F28", "MB-C_F28", "MB-C_F28", "MB-D_F28", "MB-D_F28"))
colnames(cov_merged_covFilt_sex) <- c("LG", "pos", "ref_allele", sample_names_check)
# sum values within each cell (sum the coverage of all nucleotides)
cov_merged_covFilt_sex <- cov_merged_covFilt_sex %>%
mutate(across(contains("_F"), sum_split_values))
# change the format to longer
cov_merged_covFilt_sex_longer <- cov_merged_covFilt_sex %>%
arrange(LG, pos) %>% # sort the mpileup according to position
pivot_longer(cols = tidyselect::contains("_F"), # change format to longer
names_to = c("line", "generation"),
names_sep = "_",
values_to = "coverage")%>%
mutate(line_generation = paste0(line, "_", generation))
# plot coverage
plot_cov_merged_covFilt_sex_longer <- ggplot(cov_merged_covFilt_sex_longer, aes(coverage)) +
geom_density(aes(color = line_generation)) +
geom_density(linewidth = 1.5) +
xlim(0,400) +
#ylim(0,0.055) +
scale_color_viridis(discrete = T) +
theme(legend.position = "none")+
#facet_wrap(~generation) +
ggtitle("After summing coverage across sexes; sex chromosome only", subtitle = "After filtering on coverage")+
geom_vline(xintercept = 73.58) +
geom_vline(xintercept = 73.58/2, linetype = "dashed") +
geom_vline(xintercept = 73.58 * 2, linetype = "dashed")
plot(plot_cov_merged_covFilt_sex_longer)
# convert to a plotly plot
interactive_plot_cov_merged_covFilt_sex_longer <- ggplotly(plot_cov_merged_covFilt_sex_longer)
interactive_plot_cov_merged_covFilt_sex_longer